# Pathways to External Citizenship: 
# The Global Extension of Dual Citizenship and Voting From Abroad
# Sebastián Umpierrez de Reguero 
# Maarten Vink 

# Data preparation

# Clear work space
rm(list=ls(all=TRUE))

# Set project directory with new Project function in RStudio

# required libraries
library(tidyverse)
library(haven)
library(openxlsx)
library(dplyr)
library(ggseqplot)
library(ggplot2)
library(colorspace)
library(ggh4x)
library(patchwork)
library(gridExtra)
library(gridtext)
library(grid)
library(WDI)
library(dataverse)
library(stats)
library(naniar)
library(democracyData)

# Import EVRR data
# download data from: https://doi.org/10.7910/DVN/DIJQ3H
evrr0 <-
  get_dataframe_by_name(
    filename    = "EVRR_Dataset.tab",
    dataset     = "10.7910/DVN/DIJQ3H",
    server      = "dataverse.harvard.edu"
  ) 
#13910 obs of 50 vars
# select data
evrr <- evrr0 |>
  filter(year > 1959) |> 
  filter(year < 2021) |> 
  filter(!is.na(evrr_dejure)) |> # remove years when countries are not yet independent
  mutate(iso3 = wdicode) |> #rename wdicode into iso3 to facilitate merging later
  select(iso3, country, year, evrr_dejure, evrr_defacto) # select relevant variables
View(evrr) # 10309 obs of 5 variables

#######rename country to ensure matching with GLOBALCIT data
evrr$country <- gsub("Sao Tome", "Sao Tome and Principe", evrr$country)
evrr$country <- gsub("Saint Kitts & Nevis", "Saint Kitts and Nevis", evrr$country)
evrr$country <- gsub("USA", "United States of America", evrr$country)
evrr$country <- gsub("Congo", "Congo Republic of the", evrr$country)
evrr$country <- gsub("Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines", evrr$country)
#######recode iso3
evrr$iso3 <- gsub("KOS", "XKX", evrr$iso3)
evrr$iso3 <- gsub("MNT", "MNE", evrr$iso3)
evrr$iso3 <- gsub("ROM", "ROU", evrr$iso3)
evrr$iso3 <- gsub("SER", "SRB", evrr$iso3)
evrr$iso3 <- gsub("SSU", "SSD", evrr$iso3)
evrr$iso3 <- gsub("ZAR", "COD", evrr$iso3)

# add missing 2020 year obs for Kuwait
evrr <- evrr |>
  add_row(iso3 = "KWT", country = "Kuwait", year = 2020,
          evrr_dejure = 0, evrr_defacto = 0)

#identifying missing data and univariate analyses
summary(evrr) 
str(evrr)
is.na(evrr$evrr_dejure)
sum(is.na(evrr$evrr_dejure)) # 0
is.na(evrr$evrr_defacto)
sum(is.na(evrr$evrr_defacto)) #202
mean(evrr$evrr_dejure, na.rm = TRUE)  #0.3455
mean(evrr$evrr_defacto, na.rm = TRUE) #0.2653
x1 <- evrr |> filter(is.na(evrr$evrr_defacto))
### to avoid NA's impute de facto = 0 when de jure = 0
evrr <- evrr |>
  mutate(evrr_defacto = ifelse(evrr_dejure == 0, 0, evrr_defacto))
sum(is.na(evrr$evrr_defacto)) #111 missing de facto
mean(evrr$evrr_dejure, na.rm = TRUE)  #0.3455
mean(evrr$evrr_defacto, na.rm = TRUE) #0.2624
x2 <- evrr |> filter(is.na(evrr$evrr_defacto))
# further checks
table(evrr$country, evrr$evrr_dejure) ###all 195 countries sampled register a 0/1 response
table(evrr$country, evrr$evrr_defacto) ###Eritrea, East Germany, Somalia, South Sudan are excluded if NA.omit
summary(evrr, evrr$country)
library(mice)
md.pattern(evrr)
count(evrr, country, evrr_dejure)
count(evrr, country, evrr_defacto)
xtabs(~ evrr_dejure + country, data=evrr, addNA = TRUE)
xtabs(~ evrr_defacto + country, data=evrr, addNA = TRUE)
# plot frequence distirbution 0 and 1 for all countries
library(lessR)
BarChart(evrr_dejure, data=evrr, by1=iso3)
BarChart(evrr_defacto, data=evrr, by1=iso3)

# dual citizenship data
# download GLOBALCIT Citizenship Law v2.zip from https://cadmus.eui.eu/handle/1814/73190
# save in project direct file 'data_v2.0_country-year.csv' from the Data folder in the zip file
# import file with longitudinal data on dual citizenship
dualcit <- read_csv('data_v2.0_country-year.csv') |>
  # transform L05_bin into a variable that is coded as 1 for dual citizenship acceptance
  # and 0 for no dual citizenship
  # similar to Vink et al, 2024, A global panel dataset of dyadic dual citizenship acceptance
  # https://doi.org/10.1177/01979183241305388
  mutate(ldc_bin = ifelse(L05_bin == 1, 0, # any restriction
                           ifelse(L05_bin == 0, 1, NA))) |>
  mutate(ldc2_bin = ifelse(L05_cat == 1 | L05_cat == 2, 0, # consistent restriction (only automatic loss)
                      ifelse(L05_cat == 0 | L05_cat == 3 | L05_cat == 4 | L05_cat == 5 | L05_cat == 6, 1, NA))) |>
  select(iso3, year, ldc_bin, ldc2_bin)|>
  filter(year < 2021) |> 
  filter(!is.na(ldc2_bin)) 
View(dualcit)
summary(dualcit) #10137 obs of 4 variables
# iso3                year         ldc_bin          ldc2_bin     
# Length:10137       Min.   :1960   Min.   :0.0000   Min.   :0.0000  
# Class :character   1st Qu.:1979   1st Qu.:0.0000   1st Qu.:0.0000  
# Mode  :character   Median :1994   Median :0.0000   Median :1.0000  
#                    Mean   :1993   Mean   :0.4382   Mean   :0.5623  
#                    3rd Qu.:2008   3rd Qu.:1.0000   3rd Qu.:1.0000  
#                    Max.   :2020   Max.   :1.0000   Max.   :1.0000  

# rename iso3c for predecessor states Czechoslovakia, Soviet Union, Yugoslovia 
# this is important to match historical data between evrr and dual cit data
# note that this also follows VDEM approach
# note for transparancy we keep the origin country names (so important to merge on iso3!)
dualcit$iso3 <- gsub("CSK", "CZE", dualcit$iso3)
dualcit$iso3 <- gsub("YUG", "SRB", dualcit$iso3)
dualcit$iso3 <- gsub("SUN", "RUS", dualcit$iso3)

#identifying missing data and univariate analyses
str(dualcit)
is.na(dualcit$ldc2_bin)
sum(is.na(dualcit$ldc2_bin)) #0
mean(dualcit$ldc2_bin, na.rm = TRUE) #0.56
table(dualcit$iso3, dualcit$ldc2_bin) ###all 201 countries sampled register a 0/1 response
summary(dualcit, dualcit$iso3) ### This dataset includes Yugoslavia, Vatican City, Soviet Union, Serbia and Montenegro, Czechoslovakia, GDR
md.pattern(dualcit) # No need for mice. This data set is completely observed.
xtabs(~ ldc2_bin + iso3, data=dualcit, addNA = TRUE)
# plot frequency distribution 0 and 1 for all countries
BarChart(ldc2_bin, data=dualcit, by1=iso3) 

###
# merge two data frames by iso3, year and country
total1 <- merge(evrr, dualcit,by=c("iso3","year", "iso3"), all = T) |>
  filter(!is.na(evrr_dejure))  # filter out years not covered by EVRR dataset
# 10310 obs of 7 vars

# check what happens when restrict to only matching rows
# total1b <- merge(evrr, dualcit,by=c("iso3","year", "iso3"), all = F) |>
#   filter(!is.na(evrr_dejure)) # filter out years not covered by EVRR dataset
# 9983 obs of 7 vars

#check
summary(total1) # 10309 obs of 6 vars (was: 9704) obs
table(total1$iso3) #max 61 year observations
table(total1$year) #max 195 country observations
sum(is.na(total1$evrr_dejure)) # [1] 0
sum(is.na(total1$evrr_defacto)) # [1] 111
sum(is.na(total1$ldc_bin)) # [1] 327 #missing early years for some countries
sum(is.na(total1$ldc2_bin)) # [1] 327 #missing early years for some countries

# Add ext cit (dj) variable with any dual citizenship restriction
total2 <- total1 |>
  mutate(extcit_dj = ifelse(evrr_dejure == 0 & ldc_bin == 0, "0",
                             ifelse(evrr_dejure == 1 & ldc_bin == 0, "1", 
                                    ifelse(evrr_dejure == 0 & ldc_bin == 1, "2",
                                           ifelse(evrr_dejure == 1 & ldc_bin == 1, "3",
                                                  NA)))))
table(total2$extcit_dj)
#  0    1    2    3 
# 3998 1637 2452 1896 
summary(total2)  

# Add ext cit (df) variable
total2 <- total2 |>
  mutate(extcit_df = ifelse(evrr_defacto == 0 & ldc_bin == 0, "0",
                             ifelse(evrr_defacto == 1 & ldc_bin == 0, "1", 
                                    ifelse(evrr_defacto == 0 & ldc_bin == 1, "2",
                                           ifelse(evrr_defacto == 1 & ldc_bin == 1, "3",
                                                  NA)))))
table(total2$extcit_df)
# 0    1    2    3 
# 4406 1155 2808 1508 

# ext cit (dj and df)
total2 <- total2 |>
  mutate(extcit_djf = ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc_bin == 0, "0",
                             ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc_bin == 0, "1", #only de jure
                                    ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc_bin == 0, "2", #also de facto
                                           ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc_bin == 1, "3", #only dual cit
                                                  ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc_bin == 1, "4", #de jure and dual cit
                                                         ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc_bin == 1, "5", #de facto and dual cit
                                                                NA)))))))
table(total2$extcit_djf)
# 0    1    2    3    4    5 
# 3998  408 1155 2452  356 1508 



# Add ext cit2 (dj) variable with only consistent dual citizenship restriction
total2 <- total2 |>
  mutate(extcit2_dj = ifelse(evrr_dejure == 0 & ldc2_bin == 0, "0",
                            ifelse(evrr_dejure == 1 & ldc2_bin == 0, "1", 
                                   ifelse(evrr_dejure == 0 & ldc2_bin == 1, "2",
                                          ifelse(evrr_dejure == 1 & ldc2_bin == 1, "3",
                                                 NA)))))
table(total2$extcit2_dj)
# 0    1    2    3 
# 3184 1194 3266 2339 
summary(total2)

# Add ext cit2 (df) variable
total2 <- total2 |>
  mutate(extcit2_df = ifelse(evrr_defacto == 0 & ldc2_bin == 0, "0",
                            ifelse(evrr_defacto == 1 & ldc2_bin == 0, "1", 
                                   ifelse(evrr_defacto == 0 & ldc2_bin == 1, "2",
                                          ifelse(evrr_defacto == 1 & ldc2_bin == 1, "3",
                                                 NA)))))
table(total2$extcit2_df)
# 0    1    2    3 
# 3506  838 3708 1825 

# ext cit2 (dj and df)
total2 <- total2 |>
  mutate(extcit2_djf = ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc2_bin == 0, "0",
                             ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc2_bin == 0, "1", #only de jure
                                    ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc2_bin == 0, "2", #also de facto
                                           ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc2_bin == 1, "3", #only dual cit
                                                  ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc2_bin == 1, "4", #de jure and dual cit
                                                         ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc2_bin == 1, "5", #de facto and dual cit
                                                                NA)))))))
table(total2$extcit2_djf)
# 0    1    2    3    4    5 
# 3184  322  838 3266  442 1825 


# Add covariates

# import vdem directly with vdem R package + rename iso3 for country id
load(url("https://github.com/vdeminstitute/vdemdata/raw/master/data/vdem.RData"))
vd <- vdem %>% 
  filter(year > 1959) %>% 
  filter(year < 2021) %>% 
  mutate(
    iso3 = country_text_id) |>
  group_by(iso3) |>
  mutate(
    mean_v2x_regime = mean(v2x_regime, na.rm = TRUE)) |>
  mutate(
    mean_v2x_polyarchy = mean(v2x_polyarchy, na.rm = TRUE)) |>
  mutate(
    mean_v2x_libdem = mean(v2x_libdem, na.rm = TRUE)) |>
  ungroup() |>
  select(iso3, country_name, year, v2x_regime, v2x_libdem, v2x_polyarchy, 
         mean_v2x_regime, mean_v2x_polyarchy, mean_v2x_libdem)
View(vd) ## This dataset includes Hong Kong, GDR, Palestine, Rep of Vietnam, Zanzibar
summary(vd) # 10192 obs of 9 vars
#remove South Yemen (only included 1960-1990)
vd <- vd %>% filter(iso3 != 'YMD')
# 10161 obs

# check distribution v2x_regime by country
BarChart(v2x_regime, data=vd, by1=iso3) #This dataset excludes Andorra, Antigua and Barbuda, Palau, Saint Vincent and the Grenadines, 
# Tuvalu, Tonga, Samoa, Bahamas, Belize, Brunei, Dominica, Liechtenstein, Kiribati, Monaco, Saint Lucia, Nauru, Marshall Islands, 
# Saint Kitts and Nevis
mean(vd$v2x_libdem, na.rm = TRUE)
mean(vd$v2x_polyarchy, na.rm = TRUE)
mean(vd$v2x_regime, na.rm = TRUE)

##merge VD data with main dataset
total3 <- merge(total2, vd,by=c("iso3","year"), all.x = T) |>
  filter(!is.na(evrr_dejure)) # filter out years not covered by EVRR dataset
#10310 obs
# NB all = F, default, will remove all microstates which are not covered by VDEM! if want to include, add "all=T" to merge function
summary(total3) #useful to identify NA's by variable: 934 NA's for v2x_regime

# add Freedom House data
# https://xmarquez.github.io/democracyData/index.html
remotes::install_github("xmarquez/democracyData")
library(democracyData)
#https://xmarquez.github.io/democracyData/reference/download_fh.html
# download FH data
fh0 <- download_fh(verbose = FALSE) # 9045 obs of 11 vars
# selected relevant years and variables and add iso code
fh1 <- country_year_coder(
  fh0,
  fh_country,
  year,
  code_type = "cown",
  to_system = c("cow"),
  match_type = c("country and code"),
  include_in_output = c("iso3c"),
  verbose = TRUE,
  debug = FALSE,
  match_final_year = FALSE
)

#recode CSK into CZE
fh1$iso3c[fh1$iso3c == 'CSK'] <- "CZE"

# select data
fh <- fh1 |>
  filter(year > 1959) |>
  filter(year < 2021) |> 
  mutate(iso3 = iso3c) |>
  filter(!is.na(iso3)) |>
  group_by(iso3) |>
  mutate(
    mean_fh_tot_rev = mean(fh_total_reversed)) |>
  ungroup() |>
  select(iso3, fh_country, year, status, fh_total_reversed, mean_fh_tot_rev)
# 8653 obs of 7 vars
#explore distribution by country
BarChart(status, data=fh, by1=iso3)

# rename iso3c for predecessor states Czechoslovakia, Soviet Union, Yugoslovia 
# this is important to match historical data between evrr and dual cit data
# note that this also follows VDEM approach
# note for transparency we keep the origin country names (so important to merge on iso3!)
fh$iso3 <- gsub("CSK", "CZE", fh$iso3)
fh$iso3 <- gsub("YUG", "SRB", fh$iso3)
fh$iso3 <- gsub("SUN", "RUS", fh$iso3)

#remove duplicate rows (there are some duplicates for SRB)
fh2 <- distinct(fh)

#merge FH data with main dataset
total4 <- merge(total3, fh2, by=c("iso3", "year"), all = T) |>
  filter(!is.na(evrr_dejure)) # filter out years not covered by EVRR dataset
# 10309 obs of 29 vars

#### adding world bank data
wd <- WDI(country = "all", indicator = c("SP.POP.TOTL", "NY.GDP.PCAP.KD"), 
          start = 1960, end = 2020, extra = TRUE) |>
  filter(!is.na(iso3c)) |>
  mutate(iso3 = iso3c) |>
  group_by(iso3) |>
  mutate(
    mean_pop = mean(SP.POP.TOTL, na.rm=TRUE)) |> # calculate mean population over all years
  mutate(
    mean_pop_log = log(mean_pop)) |> # log to deal with extremes
  mutate(
    mean_gdp = mean(NY.GDP.PCAP.KD, na.rm=TRUE)) |> # calculate mean GDP over all years
  mutate(
    mean_gdp_log = log(mean_gdp)) |> # log to deal with extremes
  ungroup() |>
  select(iso3, year, SP.POP.TOTL, NY.GDP.PCAP.KD, income, mean_pop, mean_pop_log, mean_gdp, mean_gdp_log)
summary(wd)
# check all missing
wd %>% summarise(across(everything(), ~ sum(!is.na(.))))
# A tibble: 1 × 9
# iso3  year    SP.POP.TOTL   NY.GDP.PCAP.KD  income  mean_pop
# 16226 16226   16135         13245           15433   16226
# ℹ 3 more variables: mean_pop_log <int>, mean_gdp <int>,
#   mean_gdp_log <int>

#impute 'High' for Czech Republic (missing from wdi)
wd$income[wd$iso3=='CZE' & wd$year > 2005] <- 'High income'

#merge wdi data with main dataset
total5 <- merge(total4, wd, by = c("iso3", "year"),
              all.x = TRUE) |>
  filter(!is.na(evrr_dejure)) # filter out years not covered by EVRR dataset
# 10310 obs of 36 vars

# ensure no unnecessary NA's in time-invariant covariates
total6 <- total5 |>
  arrange(year) |>
  group_by(iso3) |>
  fill(mean_v2x_regime, .direction = "up") |>
  fill(mean_v2x_polyarchy, .direction = "up") |>
  fill(mean_v2x_libdem, .direction = "up") |>
  fill(mean_fh_tot_rev, .direction = "up") |>
  fill(mean_pop, .direction = "up") |>
  fill(mean_pop_log, .direction = "up") |>
  fill(mean_gdp, .direction = "up") |>
  fill(mean_gdp_log, .direction = "up") |>
  ungroup()

# save file
write.csv(total6, file = "pathways_data.csv", row.names = FALSE)
library(pastecs)
stat.desc(total6)

# impute missing values ldc_bin
sum(is.na(total6$ldc_bin)) #[1] 327
total7 <- total6 |>
  arrange(year) |>
  group_by(iso3) |>
  fill(ldc_bin, .direction = "up") |>
  ungroup()
sum(is.na(total7$ldc_bin)) #[1] 0

# Add ext cit (dj) variable
total8 <- total7 |>
  mutate(extcit_dj = ifelse(evrr_dejure == 0 & ldc_bin == 0, "0",
                            ifelse(evrr_dejure == 1 & ldc_bin == 0, "1", 
                                   ifelse(evrr_dejure == 0 & ldc_bin == 1, "2",
                                          ifelse(evrr_dejure == 1 & ldc_bin == 1, "3",
                                                 NA)))))
table(total8$extcit_dj)
# 0    1    2    3 
# 4175 1659 2573 1903   # total: 10310

# Add ext cit (df) variable
total8 <- total8 |>
  mutate(extcit_df = ifelse(evrr_defacto == 0 & ldc_bin == 0, "0",
                            ifelse(evrr_defacto == 1 & ldc_bin == 0, "1", 
                                   ifelse(evrr_defacto == 0 & ldc_bin == 1, "2",
                                          ifelse(evrr_defacto == 1 & ldc_bin == 1, "3",
                                                 NA)))))
table(total8$extcit_df)
# 0    1    2    3 
# 4592 1164 2931 1512 

# ext cit (dj and df)
total8 <- total8 |>
  mutate(extcit_djf = ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc_bin == 0, "0",
                             ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc_bin == 0, "1", #only de jure
                                    ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc_bin == 0, "2", #also de facto
                                           ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc_bin == 1, "3", #only dual cit
                                                  ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc_bin == 1, "4", #de jure and dual cit
                                                         ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc_bin == 1, "5", #de facto and dual cit
                                                                NA)))))))
table(total8$extcit_djf)
# 0    1    2    3    4    5 
# 4175  417 1164 2573  358 1512 


# impute missing values ldc2_bin with consistent dc restriction
sum(is.na(total8$ldc2_bin)) #[1] 327
total9 <- total8 |>
  arrange(year) |>
  group_by(iso3) |>
  fill(ldc2_bin, .direction = "up") |>
  ungroup()
sum(is.na(total9$ldc2_bin)) #[1] 0

# Add ext cit (dj) variable
total10 <- total9 |>
  mutate(extcit2_dj = ifelse(evrr_dejure == 0 & ldc2_bin == 0, "0",
                            ifelse(evrr_dejure == 1 & ldc2_bin == 0, "1", 
                                   ifelse(evrr_dejure == 0 & ldc2_bin == 1, "2",
                                          ifelse(evrr_dejure == 1 & ldc2_bin == 1, "3",
                                                 NA)))))
table(total10$extcit2_dj)
# 0    1    2    3 
# 3297 1200 3451 2362  # total: 10310

# Add ext cit (df) variable
total10 <- total10 |>
  mutate(extcit2_df = ifelse(evrr_defacto == 0 & ldc2_bin == 0, "0",
                            ifelse(evrr_defacto == 1 & ldc2_bin == 0, "1", 
                                   ifelse(evrr_defacto == 0 & ldc2_bin == 1, "2",
                                          ifelse(evrr_defacto == 1 & ldc2_bin == 1, "3",
                                                 NA)))))
table(total10$extcit2_df)
# 0    1    2    3 
# 3620  843 3903 1833 #total: 10310

# ext cit (dj and df)
total10 <- total10 |>
  mutate(extcit2_djf = ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc2_bin == 0, "0",
                             ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc2_bin == 0, "1", #only de jure
                                    ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc2_bin == 0, "2", #also de facto
                                           ifelse(evrr_dejure == 0 & evrr_defacto == 0 & ldc2_bin == 1, "3", #only dual cit
                                                  ifelse(evrr_dejure == 1 & evrr_defacto == 0 & ldc2_bin == 1, "4", #de jure and dual cit
                                                         ifelse(evrr_dejure == 1 & evrr_defacto == 1 & ldc2_bin == 1, "5", #de facto and dual cit
                                                                NA)))))))
table(total10$extcit2_djf)
# 0    1    2    3    4    5 
# 3297  323  843 3451  452 1833 

# Save imputed dual cit data
write.csv(total10, file = "pathways_data_imp.csv", row.names = FALSE)

